1 Effect of UPSTM-Based Decorrelation on Feature Discovery

1.0.1 Loading the libraries

library("FRESA.CAD")
library(readxl)
library(igraph)
library(umap)
library(tsne)
library(entropy)
library(TH.data)
library(psych)
library(whitening)
library("vioplot")
library("rpart")
library(mlbench)

op <- par(no.readonly = TRUE)
pander::panderOptions('digits', 3)
pander::panderOptions('table.split.table', 400)
pander::panderOptions('keep.trailing.zeros',TRUE)

1.1 Material and Methods

Source W. Nick Street, Olvi L. Mangasarian and William H. Wolberg (1995). An inductive learning approach to prognostic prediction. In A. Prieditis and S. Russell, editors, Proceedings of the Twelfth International Conference on Machine Learning, pages 522–530, San Francisco, Morgan Kaufmann.

Peter Buehlmann and Torsten Hothorn (2007), Boosting algorithms: regularization, prediction and model fitting. Statistical Science, 22(4), 477–505.

1.2 The Data

wpbc {TH.data}


data("wpbc", package = "TH.data")
table(wpbc[,"status"])
#> 
#>   N   R 
#> 151  47
sum(1*(wpbc[,"status"]=="R" &  wpbc$time <= 24))
#> [1] 29
wpbc <- subset(wpbc,time > 36 | status=="R" )
summary(wpbc$time)
#>    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
#>    1.00   36.75   60.50   58.79   78.75  125.00
wpbc[,"status"] <- 1*(wpbc[,"status"]=="R")
wpbc <- wpbc[complete.cases(wpbc),]
pander::pander(table(wpbc[,"status"]))
0 1
91 46
wpbc$time <- NULL

1.2.0.1 Standarize the names for the reporting

studyName <- "Wisconsin"
dataframe <- wpbc
outcome <- "status"
thro <- 0.4
TopVariables <- 10
cexheat = 0.25

1.3 Generaring the report

1.3.1 Libraries

Some libraries

library(psych)
library(whitening)
library("vioplot")
library("rpart")

1.3.2 Data specs

pander::pander(c(rows=nrow(dataframe),col=ncol(dataframe)-1))
rows col
137 32
pander::pander(table(dataframe[,outcome]))
0 1
91 46

varlist <- colnames(dataframe)
varlist <- varlist[varlist != outcome]

largeSet <- length(varlist) > 1500 

1.3.3 Scaling the data

Scaling and removing near zero variance columns and highly co-linear(r>0.99999) columns


  ### Some global cleaning
  sdiszero <- apply(dataframe,2,sd) > 1.0e-16
  dataframe <- dataframe[,sdiszero]

  varlist <- colnames(dataframe)[colnames(dataframe) != outcome]
  tokeep <- c(as.character(correlated_Remove(dataframe,varlist,thr=0.99999)),outcome)
  dataframe <- dataframe[,tokeep]

  varlist <- colnames(dataframe)
  varlist <- varlist[varlist != outcome]
  
  iscontinous <- sapply(apply(dataframe,2,unique),length) >= 5 ## Only variables with enough samples



dataframeScaled <- FRESAScale(dataframe,method="OrderLogit")$scaledData

1.4 The heatmap of the data

numsub <- nrow(dataframe)
if (numsub > 1000) numsub <- 1000


if (!largeSet)
{

  hm <- heatMaps(data=dataframeScaled[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 xlab="Feature",
                 ylab="Sample",
                 srtCol=45,
                 srtRow=45,
                 cexCol=cexheat,
                 cexRow=cexheat
                 )
  par(op)
}

1.4.0.1 Correlation Matrix of the Data

The heat map of the data


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  #cormat <- Rfast::cora(as.matrix(dataframe[,varlist]),large=TRUE)
  cormat <- cor(dataframe[,varlist],method="pearson")
  cormat[is.na(cormat)] <- 0
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Original Correlation",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.9961379

1.5 The decorrelation


DEdataframe <- IDeA(dataframe,verbose=TRUE,thr=thro)
#> 
#>  mean_perimeter mean_texture tsize 
#>      mean_radius     mean_texture   mean_perimeter        mean_area 
#>          0.96875          0.50000          1.00000          0.93750 
#>  mean_smoothness mean_compactness 
#>          0.34375          0.40625 
#> 
#>  Included: 32 , Uni p: 0.0046875 , Base Size: 3 , Rcrit: 0.2212374 
#> 
#> 
 1 <R=0.996,thr=0.950>, Top: 3< 2 >[Fa= 3 ]( 3 , 6 , 0 ),<|>Tot Used: 9 , Added: 6 , Zero Std: 0 , Max Cor: 0.922
#> 
 2 <R=0.922,thr=0.900>, Top: 1< 1 >[Fa= 3 ]( 1 , 1 , 3 ),<|>Tot Used: 9 , Added: 1 , Zero Std: 0 , Max Cor: 0.891
#> 
 3 <R=0.891,thr=0.800>, Top: 6< 1 >[Fa= 9 ]( 6 , 6 , 3 ),<|>Tot Used: 18 , Added: 6 , Zero Std: 0 , Max Cor: 0.842
#> 
 4 <R=0.842,thr=0.800>, Top: 1< 1 >[Fa= 10 ]( 1 , 1 , 9 ),<|>Tot Used: 20 , Added: 1 , Zero Std: 0 , Max Cor: 0.789
#> 
 5 <R=0.789,thr=0.700>, Top: 6< 1 >[Fa= 12 ]( 6 , 6 , 10 ),<|>Tot Used: 25 , Added: 6 , Zero Std: 0 , Max Cor: 0.743
#> 
 6 <R=0.743,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 12 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.698
#> 
 7 <R=0.698,thr=0.600>, Top: 3< 2 >[Fa= 13 ]( 3 , 4 , 13 ),<|>Tot Used: 27 , Added: 4 , Zero Std: 0 , Max Cor: 0.768
#> 
 8 <R=0.768,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 13 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.600
#> 
 9 <R=0.600,thr=0.600>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.749
#> 
 10 <R=0.749,thr=0.700>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.600
#> 
 11 <R=0.600,thr=0.500>, Top: 7< 1 >[Fa= 15 ]( 7 , 8 , 13 ),<|>Tot Used: 28 , Added: 8 , Zero Std: 0 , Max Cor: 0.685
#> 
 12 <R=0.685,thr=0.600>, Top: 2< 1 >[Fa= 15 ]( 2 , 2 , 15 ),<|>Tot Used: 28 , Added: 2 , Zero Std: 0 , Max Cor: 0.607
#> 
 13 <R=0.607,thr=0.600>, Top: 1< 1 >[Fa= 15 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.580
#> 
 14 <R=0.580,thr=0.500>, Top: 1< 1 >[Fa= 16 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.500
#> 
 15 <R=0.500,thr=0.400>, Top: 9< 1 >[Fa= 18 ]( 8 , 11 , 16 ),<|>Tot Used: 32 , Added: 11 , Zero Std: 0 , Max Cor: 0.573
#> 
 16 <R=0.573,thr=0.500>, Top: 2< 1 >[Fa= 18 ]( 2 , 2 , 18 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.548
#> 
 17 <R=0.548,thr=0.500>, Top: 1< 1 >[Fa= 18 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.630
#> 
 18 <R=0.630,thr=0.600>, Top: 1< 1 >[Fa= 19 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.493
#> 
 19 <R=0.493,thr=0.400>, Top: 5< 1 >[Fa= 20 ]( 5 , 6 , 19 ),<|>Tot Used: 32 , Added: 6 , Zero Std: 0 , Max Cor: 0.512
#> 
 20 <R=0.512,thr=0.500>, Top: 1< 1 >[Fa= 21 ]( 1 , 1 , 20 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.469
#> 
 21 <R=0.469,thr=0.400>, Top: 7< 1 >[Fa= 22 ]( 4 , 4 , 21 ),<|>Tot Used: 32 , Added: 4 , Zero Std: 0 , Max Cor: 0.501
#> 
 22 <R=0.501,thr=0.500>, Top: 1< 1 >[Fa= 22 ]( 1 , 1 , 22 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.488
#> 
 23 <R=0.488,thr=0.400>, Top: 2< 1 >[Fa= 22 ]( 2 , 2 , 22 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.398
#> 
 24 <R=0.398,thr=0.400>
#> 
 [ 24 ], 0.3981401 Decor Dimension: 32 Nused: 32 . Cor to Base: 24 , ABase: 32 , Outcome Base: 0 
#> 
varlistc <- colnames(DEdataframe)[colnames(DEdataframe) != outcome]

pander::pander(sum(apply(dataframe[,varlist],2,var)))

515156

pander::pander(sum(apply(DEdataframe[,varlistc],2,var)))

6371

pander::pander(entropy(discretize(unlist(dataframe[,varlist]), 256)))

1.39

pander::pander(entropy(discretize(unlist(DEdataframe[,varlistc]), 256)))

1.3

1.5.1 The decorrelation matrix


if (!largeSet)
{

  par(cex=0.6,cex.main=0.85,cex.axis=0.7)
  
  UPLTM <- attr(DEdataframe,"UPLTM")
  
  gplots::heatmap.2(1.0*(abs(UPLTM)>0),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
  par(op)
  
  
  
}

1.5.2 Formulas Network

Displaying the features associations

par(op)

if (ncol(dataframe) < 1000)
{

  DEdataframeB <- ILAA(dataframe,verbose=TRUE,thr=thro,bootstrap=30)

  transform <- 1*(attr(DEdataframeB,"UPLTM") != 0)
  print(ncol(transform))
  thrcol <- 1 + 0.025*nrow(transform)
  rsum <- apply(1*(transform !=0),1,sum) > 2
  csum <- apply(1*(transform !=0),2,sum) > thrcol | rsum
  transform <- transform[csum,csum]
  csum <- (apply(1*(transform !=0),2,sum) > 1) & (apply(1*(transform !=0),1,sum) > 1)
  transform <- transform[csum,csum]
  print(ncol(transform))
  if (ncol(transform)>100)
  {
    thrcol <- 1 + 0.10*nrow(transform)
    rsum <- apply(1*(transform !=0),1,sum) > 4
    csum <- apply(1*(transform !=0),2,sum) > thrcol | rsum
    transform <- transform[csum,csum]
    csum <- (apply(1*(transform !=0),2,sum) > 3) & (apply(1*(transform !=0),1,sum) > 3)
    transform <- transform[csum,csum]
  }
  print(ncol(transform))
  if (ncol(transform)>100)
  {
    thrcol <- 1 + 0.20*nrow(transform)
    rsum <- apply(1*(transform !=0),1,sum) > 8
    csum <- apply(1*(transform !=0),2,sum) > thrcol | rsum
    transform <- transform[csum,csum]
    csum <- (apply(1*(transform !=0),2,sum) > 7) & (apply(1*(transform !=0),1,sum) > 7)
    transform <- transform[csum,csum]
  }
  print(ncol(transform))

  if ((ncol(transform) > 10) && (ncol(transform) < 150))
  {
    
      gplots::heatmap.2(transform,
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Red Decorrelation matrix",
                    cexRow = cexheat,
                    cexCol = cexheat,
                   srtCol=45,
                   srtRow=45,
                    key.title=NA,
                    key.xlab="|Beta|>0",
                    xlab="Output Feature", ylab="Input Feature")
  
      par(op)

    
    colnames(transform) <- str_remove_all(colnames(transform),"La_")
    
    VertexSize <- apply(transform,2,mean)
    VertexSize <- 5*VertexSize/max(VertexSize)
    
    gr <- graph_from_adjacency_matrix(transform,mode = "directed",diag = FALSE,weighted=TRUE)
    gr$layout <- layout_with_fr
    
    fc <- cluster_optimal(gr)
    plot(fc, gr,
         edge.width = 0.5*E(gr)$weight,
         vertex.size=VertexSize,
         edge.arrow.size=0.5,
         edge.arrow.width=0.5,
         vertex.label.cex=0.65,
         vertex.label.dist=1,
         main="Feature Association")
  }
}
#> fast | LM |
#>  mean_perimeter mean_texture tsize 
#>      mean_radius     mean_texture   mean_perimeter        mean_area 
#>          0.96875          0.50000          1.00000          0.93750 
#>  mean_smoothness mean_compactness 
#>          0.34375          0.40625 
#> 
#>  Included: 32 , Uni p: 0.0046875 , Base Size: 3 , Rcrit: 0.2212374 
#> 
#> 
 1 <R=0.996,thr=0.950>, Top: 3< 2 >[Fa= 3 ]( 3 , 6 , 0 ),<|>Tot Used: 9 , Added: 6 , Zero Std: 0 , Max Cor: 0.922
#> 
 2 <R=0.922,thr=0.900>, Top: 1< 1 >[Fa= 3 ]( 1 , 1 , 3 ),<|>Tot Used: 9 , Added: 1 , Zero Std: 0 , Max Cor: 0.891
#> 
 3 <R=0.891,thr=0.800>, Top: 6< 1 >[Fa= 9 ]( 6 , 6 , 3 ),<|>Tot Used: 18 , Added: 6 , Zero Std: 0 , Max Cor: 0.842
#> 
 4 <R=0.842,thr=0.800>, Top: 1< 1 >[Fa= 10 ]( 1 , 1 , 9 ),<|>Tot Used: 20 , Added: 1 , Zero Std: 0 , Max Cor: 0.789
#> 
 5 <R=0.789,thr=0.700>, Top: 6< 1 >[Fa= 12 ]( 6 , 6 , 10 ),<|>Tot Used: 25 , Added: 6 , Zero Std: 0 , Max Cor: 0.743
#> 
 6 <R=0.743,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 12 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.698
#> 
 7 <R=0.698,thr=0.600>, Top: 3< 2 >[Fa= 13 ]( 3 , 4 , 13 ),<|>Tot Used: 27 , Added: 4 , Zero Std: 0 , Max Cor: 0.768
#> 
 8 <R=0.768,thr=0.700>, Top: 2< 1 >[Fa= 13 ]( 2 , 2 , 13 ),<|>Tot Used: 27 , Added: 2 , Zero Std: 0 , Max Cor: 0.600
#> 
 9 <R=0.600,thr=0.600>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.749
#> 
 10 <R=0.749,thr=0.700>, Top: 1< 1 >[Fa= 13 ]( 1 , 1 , 13 ),<|>Tot Used: 27 , Added: 1 , Zero Std: 0 , Max Cor: 0.600
#> 
 11 <R=0.600,thr=0.500>, Top: 7< 1 >[Fa= 15 ]( 7 , 8 , 13 ),<|>Tot Used: 28 , Added: 8 , Zero Std: 0 , Max Cor: 0.685
#> 
 12 <R=0.685,thr=0.600>, Top: 2< 1 >[Fa= 15 ]( 2 , 2 , 15 ),<|>Tot Used: 28 , Added: 2 , Zero Std: 0 , Max Cor: 0.607
#> 
 13 <R=0.607,thr=0.600>, Top: 1< 1 >[Fa= 15 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.580
#> 
 14 <R=0.580,thr=0.500>, Top: 1< 1 >[Fa= 16 ]( 1 , 1 , 15 ),<|>Tot Used: 28 , Added: 1 , Zero Std: 0 , Max Cor: 0.500
#> 
 15 <R=0.500,thr=0.400>, Top: 9< 1 >[Fa= 18 ]( 8 , 11 , 16 ),<|>Tot Used: 32 , Added: 11 , Zero Std: 0 , Max Cor: 0.573
#> 
 16 <R=0.573,thr=0.500>, Top: 2< 1 >[Fa= 18 ]( 2 , 2 , 18 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.548
#> 
 17 <R=0.548,thr=0.500>, Top: 1< 1 >[Fa= 18 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.630
#> 
 18 <R=0.630,thr=0.600>, Top: 1< 1 >[Fa= 19 ]( 1 , 1 , 18 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.493
#> 
 19 <R=0.493,thr=0.400>, Top: 5< 1 >[Fa= 20 ]( 5 , 6 , 19 ),<|>Tot Used: 32 , Added: 6 , Zero Std: 0 , Max Cor: 0.512
#> 
 20 <R=0.512,thr=0.500>, Top: 1< 1 >[Fa= 21 ]( 1 , 1 , 20 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.469
#> 
 21 <R=0.469,thr=0.400>, Top: 7< 1 >[Fa= 22 ]( 4 , 4 , 21 ),<|>Tot Used: 32 , Added: 4 , Zero Std: 0 , Max Cor: 0.501
#> 
 22 <R=0.501,thr=0.500>, Top: 1< 1 >[Fa= 22 ]( 1 , 1 , 22 ),<|>Tot Used: 32 , Added: 1 , Zero Std: 0 , Max Cor: 0.488
#> 
 23 <R=0.488,thr=0.400>, Top: 2< 1 >[Fa= 22 ]( 2 , 2 , 22 ),<|>Tot Used: 32 , Added: 2 , Zero Std: 0 , Max Cor: 0.398
#> 
 24 <R=0.398,thr=0.400>
#> 
 [ 24 ], 0.3981401 Decor Dimension: 32 Nused: 32 . Cor to Base: 24 , ABase: 32 , Outcome Base: 0 
#> 
bootstrapping->..............................
#> 
[1] 32
#> [1] 27
#> [1] 27
#> [1] 27


par(op)

1.6 The heatmap of the decorrelated data

if (!largeSet)
{

  hm <- heatMaps(data=DEdataframe[1:numsub,],
                 Outcome=outcome,
                 Scale=TRUE,
                 hCluster = "row",
                 cexRow = cexheat,
                 cexCol = cexheat,
                 srtCol=45,
                 srtRow=45,
                 xlab="Feature",
                 ylab="Sample")
  par(op)
}

1.7 The correlation matrix after decorrelation

if (!largeSet)
{

  cormat <- cor(DEdataframe[,varlistc],method="pearson")
  cormat[is.na(cormat)] <- 0
  
  gplots::heatmap.2(abs(cormat),
                    trace = "none",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "Correlation after ILAA",
                    cexRow = cexheat,
                    cexCol = cexheat,
                     srtCol=45,
                     srtRow=45,
                    key.title=NA,
                    key.xlab="|Pearson Correlation|",
                    xlab="Feature", ylab="Feature")
  
  par(op)
  diag(cormat) <- 0
  print(max(abs(cormat)))
}

[1] 0.3981401

1.8 U-MAP Visualization of features

1.8.1 The UMAP based on LASSO on Raw Data


if (nrow(dataframe) < 1000)
{
  classes <- unique(dataframe[1:numsub,outcome])
  raincolors <- rainbow(length(classes))
  names(raincolors) <- classes
  datasetframe.umap = umap(scale(dataframe[1:numsub,varlist]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: Original",t='n')
  text(datasetframe.umap$layout,labels=dataframe[1:numsub,outcome],col=raincolors[dataframe[1:numsub,outcome]+1])
}

1.8.2 The decorralted UMAP

if (nrow(dataframe) < 1000)
{

  datasetframe.umap = umap(scale(DEdataframe[1:numsub,varlistc]),n_components=2)
  plot(datasetframe.umap$layout,xlab="U1",ylab="U2",main="UMAP: After ILAA",t='n')
  text(datasetframe.umap$layout,labels=DEdataframe[1:numsub,outcome],col=raincolors[DEdataframe[1:numsub,outcome]+1])
}

1.9 Univariate Analysis

1.9.1 Univariate



univarRAW <- uniRankVar(varlist,
               paste(outcome,"~1"),
               outcome,
               dataframe,
               rankingTest="AUC")



univarDe <- uniRankVar(varlistc,
               paste(outcome,"~1"),
               outcome,
               DEdataframe,
               rankingTest="AUC",
               )

1.9.2 Final Table


univariate_columns <- c("caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC")

##top variables
topvar <- c(1:length(varlist)) <= TopVariables
tableRaw <- univarRAW$orderframe[topvar,univariate_columns]
pander::pander(tableRaw)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
tsize 3.47 2.03 2.64 1.86 1.11e-03 0.666
pnodes 4.87 6.02 2.63 5.21 6.25e-09 0.650
worst_radius 22.67 4.70 20.35 4.08 3.68e-01 0.647
worst_perimeter 151.33 32.42 135.34 26.85 5.71e-01 0.645
mean_area 1081.98 397.26 888.40 310.85 1.26e-01 0.645
worst_area 1635.77 703.15 1317.95 550.94 2.72e-01 0.643
mean_perimeter 121.10 22.91 110.02 19.19 4.72e-01 0.641
mean_radius 18.33 3.37 16.70 2.91 3.12e-01 0.639
SE_perimeter 4.73 2.21 3.81 1.80 6.37e-02 0.634
SE_area 81.97 53.36 61.22 37.72 6.46e-02 0.632


topLAvar <- univarDe$orderframe$Name[str_detect(univarDe$orderframe$Name,"La_")]
topLAvar <- unique(c(univarDe$orderframe$Name[topvar],topLAvar[1:as.integer(TopVariables/2)]))
finalTable <- univarDe$orderframe[topLAvar,univariate_columns]


pander::pander(finalTable)
  caseMean caseStd controlMean controlStd controlKSP ROCAUC
tsize 3.47174 2.02985 2.63846 1.85507 0.00111 0.666
La_worst_area 409.10338 59.42237 440.08923 60.65702 0.52089 0.653
La_SE_symmetry -0.00356 0.00448 -0.00579 0.00572 0.23435 0.645
mean_perimeter 121.09522 22.91019 110.02231 19.18940 0.47168 0.641
La_worst_fractaldim -0.09538 0.00580 -0.09896 0.00862 0.93594 0.639
La_mean_smoothness 0.09341 0.00508 0.09102 0.00555 0.90681 0.626
La_mean_fractaldim 0.08522 0.00324 0.08699 0.00471 0.35987 0.600
La_worst_perimeter -5.80502 11.69785 -7.42934 11.47409 0.02995 0.577
La_SE_concavity -0.01260 0.00681 -0.01037 0.00581 0.22238 0.573
La_worst_concavity 0.14327 0.09717 0.11997 0.08164 0.81386 0.572

dc <- getLatentCoefficients(DEdataframe)
fscores <- attr(DEdataframe,"fscore")


pander::pander(c(mean=mean(sapply(dc,length)),total=length(dc),fraction=length(dc)/(ncol(dataframe)-1)))
mean total fraction
4.79 29 0.906

theCharformulas <- attr(dc,"LatentCharFormulas")


finalTable <- rbind(finalTable,tableRaw[topvar[!(topvar %in% topLAvar)],univariate_columns])


orgnamez <- rownames(finalTable)
orgnamez <- str_remove_all(orgnamez,"La_")
finalTable$RAWAUC <- univarRAW$orderframe[orgnamez,"ROCAUC"]
finalTable$DecorFormula <- theCharformulas[rownames(finalTable)]
finalTable$fscores <- fscores[rownames(finalTable)]

Final_Columns <- c("DecorFormula","caseMean","caseStd","controlMean","controlStd","controlKSP","ROCAUC","RAWAUC","fscores")

finalTable <- finalTable[order(-finalTable$ROCAUC),]
pander::pander(finalTable[,Final_Columns])
  DecorFormula caseMean caseStd controlMean controlStd controlKSP ROCAUC RAWAUC fscores
tsize NA 3.47e+00 2.03e+00 2.64e+00 1.86e+00 1.11e-03 0.666 0.666 1
tsize1 NA 3.47e+00 2.03e+00 2.64e+00 1.86e+00 1.11e-03 0.666 NA NA
La_worst_area + (2.56e+02)mean_radius - (4.838)mean_perimeter - (2.009)mean_area - (1.39e+02)worst_radius + (0.035)worst_perimeter + worst_area 4.09e+02 5.94e+01 4.40e+02 6.07e+01 5.21e-01 0.653 0.643 -2
pnodes NA 4.87e+00 6.02e+00 2.63e+00 5.21e+00 6.25e-09 0.650 0.650 NA
worst_radius NA 2.27e+01 4.70e+00 2.03e+01 4.08e+00 3.68e-01 0.647 0.647 NA
worst_perimeter NA 1.51e+02 3.24e+01 1.35e+02 2.68e+01 5.71e-01 0.645 0.645 NA
La_SE_symmetry - (2.99e-03)mean_radius + (5.30e-04)mean_perimeter + (3.57e-03)mean_symmetry - (0.376)mean_fractaldim + SE_symmetry - (2.978)SE_fractaldim - (3.80e-03)worst_compactness - (0.099)worst_symmetry + (0.375)worst_fractaldim -3.56e-03 4.48e-03 -5.79e-03 5.72e-03 2.34e-01 0.645 0.504 -4
mean_area NA 1.08e+03 3.97e+02 8.88e+02 3.11e+02 1.26e-01 0.645 0.645 NA
worst_area NA 1.64e+03 7.03e+02 1.32e+03 5.51e+02 2.72e-01 0.643 0.643 NA
mean_perimeter NA 1.21e+02 2.29e+01 1.10e+02 1.92e+01 4.72e-01 0.641 0.641 9
mean_perimeter1 NA 1.21e+02 2.29e+01 1.10e+02 1.92e+01 4.72e-01 0.641 NA NA
La_worst_fractaldim - (0.029)mean_radius + (4.31e-03)mean_perimeter - (2.460)mean_fractaldim - (0.071)worst_compactness + worst_fractaldim -9.54e-02 5.80e-03 -9.90e-02 8.62e-03 9.36e-01 0.639 0.583 -1
mean_radius NA 1.83e+01 3.37e+00 1.67e+01 2.91e+00 3.12e-01 0.639 0.639 NA
SE_perimeter NA 4.73e+00 2.21e+00 3.81e+00 1.80e+00 6.37e-02 0.634 0.634 NA
SE_area NA 8.20e+01 5.34e+01 6.12e+01 3.77e+01 6.46e-02 0.632 0.632 NA
La_mean_smoothness - (0.012)mean_radius + (2.32e-03)mean_perimeter + mean_smoothness - (0.411)mean_concavepoints - (0.521)mean_fractaldim 9.34e-02 5.08e-03 9.10e-02 5.55e-03 9.07e-01 0.626 0.518 -1
La_mean_fractaldim + (0.019)mean_radius - (2.69e-03)mean_perimeter + mean_fractaldim 8.52e-02 3.24e-03 8.70e-02 4.71e-03 3.60e-01 0.600 0.615 5
La_worst_perimeter - (1.298)mean_perimeter + worst_perimeter -5.81e+00 1.17e+01 -7.43e+00 1.15e+01 2.99e-02 0.577 0.645 2
La_SE_concavity - (0.766)SE_compactness + SE_concavity - (1.245)SE_concavepoints + (0.455)SE_fractaldim + (0.064)worst_compactness - (0.077)worst_concavity -1.26e-02 6.81e-03 -1.04e-02 5.81e-03 2.22e-01 0.573 0.478 -2
La_worst_concavity - (0.830)worst_compactness + worst_concavity 1.43e-01 9.72e-02 1.20e-01 8.16e-02 8.14e-01 0.572 0.492 1

1.10 Comparing ILAA vs PCA vs EFA

1.10.1 PCA

featuresnames <- colnames(dataframe)[colnames(dataframe) != outcome]
pc <- prcomp(dataframe[,iscontinous],center = TRUE,scale. = TRUE)   #principal components
predPCA <- predict(pc,dataframe[,iscontinous])
PCAdataframe <- as.data.frame(cbind(predPCA,dataframe[,!iscontinous]))
colnames(PCAdataframe) <- c(colnames(predPCA),colnames(dataframe)[!iscontinous]) 
#plot(PCAdataframe[,colnames(PCAdataframe)!=outcome],col=dataframe[,outcome],cex=0.65,cex.lab=0.5,cex.axis=0.75,cex.sub=0.5,cex.main=0.75)

#pander::pander(pc$rotation)


PCACor <- cor(PCAdataframe[,colnames(PCAdataframe) != outcome])


  gplots::heatmap.2(abs(PCACor),
                    trace = "none",
  #                  scale = "row",
                    mar = c(5,5),
                    col=rev(heat.colors(5)),
                    main = "PCA Correlation",
                    cexRow = 0.5,
                    cexCol = 0.5,
                     srtCol=45,
                     srtRow= -45,
                    key.title=NA,
                    key.xlab="Pearson Correlation",
                    xlab="Feature", ylab="Feature")

1.10.2 EFA


EFAdataframe <- dataframeScaled

if (length(iscontinous) < 2000)
{
  topred <- min(length(iscontinous),nrow(dataframeScaled),ncol(predPCA)/2)
  if (topred < 2) topred <- 2
  
  uls <- fa(dataframeScaled[,iscontinous],nfactors=topred,rotate="varimax",warnings=FALSE)  # EFA analysis
  predEFA <- predict(uls,dataframeScaled[,iscontinous])
  EFAdataframe <- as.data.frame(cbind(predEFA,dataframeScaled[,!iscontinous]))
  colnames(EFAdataframe) <- c(colnames(predEFA),colnames(dataframeScaled)[!iscontinous]) 


  
  EFACor <- cor(EFAdataframe[,colnames(EFAdataframe) != outcome])
  
  
    gplots::heatmap.2(abs(EFACor),
                      trace = "none",
    #                  scale = "row",
                      mar = c(5,5),
                      col=rev(heat.colors(5)),
                      main = "EFA Correlation",
                      cexRow = 0.5,
                      cexCol = 0.5,
                       srtCol=45,
                       srtRow= -45,
                      key.title=NA,
                      key.xlab="Pearson Correlation",
                      xlab="Feature", ylab="Feature")
}

1.11 Effect on CAR modeling

par(op)
par(xpd = TRUE)
dataframe[,outcome] <- factor(dataframe[,outcome])
rawmodel <- rpart(paste(outcome,"~."),dataframe,control=rpart.control(maxdepth=3))
pr <- predict(rawmodel,dataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(rawmodel,main="Raw",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(rawmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,dataframe[,outcome]==0))
  }


pander::pander(table(dataframe[,outcome],pr))
  0 1
0 68 23
1 9 37
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.766 0.687 0.834
3 se 0.804 0.661 0.906
4 sp 0.747 0.645 0.833
6 diag.or 12.155 5.100 28.966

par(op)
par(xpd = TRUE)
DEdataframe[,outcome] <- factor(DEdataframe[,outcome])
IDeAmodel <- rpart(paste(outcome,"~."),DEdataframe,control=rpart.control(maxdepth=3))
pr <- predict(IDeAmodel,DEdataframe,type = "class")

  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(IDeAmodel,main="ILAA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(IDeAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,DEdataframe[,outcome]==0))
  }

pander::pander(table(DEdataframe[,outcome],pr))
  0 1
0 89 2
1 33 13
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.745 0.663 0.815
3 se 0.283 0.160 0.435
4 sp 0.978 0.923 0.997
6 diag.or 17.530 3.753 81.883

par(op)
par(xpd = TRUE)
PCAdataframe[,outcome] <- factor(PCAdataframe[,outcome])
PCAmodel <- rpart(paste(outcome,"~."),PCAdataframe,control=rpart.control(maxdepth=3))
pr <- predict(PCAmodel,PCAdataframe,type = "class")
ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
if (length(unique(pr))>1)
{
  plot(PCAmodel,main="PCA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
  text(PCAmodel, use.n = TRUE,cex=0.75)
  ptab <- epiR::epi.tests(table(pr==0,PCAdataframe[,outcome]==0))
}

pander::pander(table(PCAdataframe[,outcome],pr))
  0 1
0 86 5
1 27 19
pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.766 0.687 0.834
3 se 0.413 0.270 0.568
4 sp 0.945 0.876 0.982
6 diag.or 12.104 4.128 35.493


par(op)

1.11.1 EFA


  EFAdataframe[,outcome] <- factor(EFAdataframe[,outcome])
  EFAmodel <- rpart(paste(outcome,"~."),EFAdataframe,control=rpart.control(maxdepth=3))
  pr <- predict(EFAmodel,EFAdataframe,type = "class")
  
  ptab <- list(er="Error",detail=matrix(nrow=6,ncol=1))
  if (length(unique(pr))>1)
  {
    plot(EFAmodel,main="EFA",branch=0.5,uniform = TRUE,compress = TRUE,margin=0.1)
    text(EFAmodel, use.n = TRUE,cex=0.75)
    ptab <- epiR::epi.tests(table(pr==0,EFAdataframe[,outcome]==0))
  }


  pander::pander(table(EFAdataframe[,outcome],pr))
  0 1
0 90 1
1 38 8
  pander::pander(ptab$detail[c(5,3,4,6),])
  statistic est lower upper
5 diag.ac 0.715 0.6320 0.789
3 se 0.174 0.0782 0.314
4 sp 0.989 0.9403 1.000
6 diag.or 18.947 2.2899 156.776
  par(op)